- Covid Tracking Dataset (w/ hospitalised data)
- An Exploratory data analysis of the US dataset
- 1. Basic triad of the dataset: investigating data types and number of rows per column
# Imports
import os
import pandas as pd
import csv
import kaggle
# other imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from copy import copy
import seaborn as sns
from scipy.stats import norm
import matplotlib.dates as mdates
# import matplotlib.colors as mcolors
# import random
# import math
# import time
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
# from sklearn.svm import SVR
from datetime import date, datetime
from dateutil.parser import parse
import us
# import operator
# plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline
Covid Tracking Dataset (w/ hospitalised data)
Source: https://covidtracking.com/
all_cases = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')
# Delete unecessary rows
for row in ['negative', 'pending', 'hash', 'negativeIncrease', 'totalTestResults', 'totalTestResultsIncrease', 'dateChecked', 'fips', 'inIcuCumulative', 'onVentilatorCumulative', 'total', 'posNeg', 'deathIncrease', 'hospitalizedIncrease', 'positiveIncrease']:
del all_cases[row]
# TODO missing values
# Do we get avg or missing values, or predict them?
# See https://developerzen.com/data-mining-handling-missing-values-the-database-bd2241882e72
for i, row in all_cases.iterrows():
# Set Dates
s = str(row['date'])
all_cases.at[i, 'date'] = date(year=int(s[0:4]), month=int(s[4:6]), day=int(s[6:8]))
# Missing death figures means no death reports yet
# These are set to 0
for i, row in all_cases.iterrows():
if np.isnan(row['death']):
all_cases.at[i, 'death'] = 0
# TODO Replace active cases with JHU and/or regression model (Selma)
all_cases['active'] = all_cases['positive'] - all_cases['recovered'] - all_cases['death']
# change location of 'active' column
cols = list(all_cases)
cols.insert(3, cols.pop(cols.index('active')))
all_cases = all_cases.loc[:, cols]
# Load datasets for US population and Hospital beds per 1000
us_population = pd.read_csv('data/us_population.csv')
hosp_beds = pd.read_csv('data/hospital_beds.csv')
state_abbrev = pd.read_csv('data/us_state_names.csv')
# add state abbreviations to us_population and hospital beds dataframe
for state in state_abbrev['State'].tolist():
# store state abbreviation in variable
abbrev = state_abbrev.loc[state_abbrev['State'] == state, 'Abbreviation'].tolist()[0]
# add abbrev to new column 'Abbreviation' in us_population df
us_population.loc[us_population['State'] == state, 'Abbreviation'] = abbrev
# add abbrev to new column in hosp_beds df
hosp_beds.loc[hosp_beds['Location'] == state, 'Abbreviation'] = abbrev
# change order of columns of us_population
cols = list(us_population)
cols.insert(2, cols.pop(cols.index('Abbreviation')))
us_population = us_population.loc[:, cols]
# drop unnecessary columns of us_population
us_population = us_population.drop(columns=['rank', 'Growth', 'Pop2018', 'Pop2010', 'growthSince2010', 'Percent', 'density'])
# drop unnecessary columns of hosp_beds
hosp_beds = hosp_beds.drop(columns=['Location', 'State/Local Government', 'Non-Profit', 'For-Profit'])
# change order of columns of hosp_beds
cols = list(hosp_beds)
cols.insert(0, cols.pop(cols.index('Abbreviation')))
hosp_beds = hosp_beds.loc[:, cols]
us_population.head()
hosp_beds.head()
# filter out non-existing states like 'AS'
all_cases = all_cases[all_cases['state'].isin(state_abbrev['Abbreviation'].tolist())]
# see what filtered dataframe looks like
all_cases.head()
# check which states have 0 positive cases
all_cases.loc[all_cases['positive'] == 0]
# Split dataframes by date
df_split_by_date = dict(tuple(all_cases.groupby('date')))
# Split dataframes by state
df_split_by_state = dict(tuple(all_cases.groupby('state')))
# merge dataframes us_population and all_cases
df_merge_uspop = all_cases.merge(us_population, how='left', left_on='state', right_on='Abbreviation')
df_merge_uspop = df_merge_uspop.drop(columns=['Abbreviation'])
df_merge_uspop = df_merge_uspop.rename(columns={'Pop': 'population'})
# change location of 'population' column
cols = list(df_merge_uspop)
cols.insert(2, cols.pop(cols.index('population')))
df_merge_uspop = df_merge_uspop.loc[:, cols]
# merge dataframes hosp_beds and df_merge_uspop
df_merge_hosp = df_merge_uspop.merge(hosp_beds, how='left', left_on='state', right_on='Abbreviation')
df_merge_hosp = df_merge_hosp.drop(columns=['Abbreviation'])
all_cases = df_merge_hosp.rename(columns={'Total': 'bedsPerThousand'})
all_cases.head()
# Calculate the total beds, and add the column
all_cases['total_beds'] = all_cases['population'] / 1000 * all_cases['bedsPerThousand']
# change abbreviations to state names
all_cases = all_cases.rename(columns={'state': 'abbrev'})
all_cases = all_cases.rename(columns={'State': 'state'})
# change location of 'state' column
cols = list(all_cases)
cols.insert(1, cols.pop(cols.index('state')))
all_cases = all_cases.loc[:, cols]
all_cases.head()
- Load and clean JHU data
- Merge JHU dataset with main dataset
# This cell takes some time, as it needs to connect to Kaggle Servers to retrieve data
kaggle.api.authenticate()
kaggle.api.dataset_download_files('benhamner/jhucovid19', path='./kaggle/input/jhucovid19/', unzip=True)
# Get Time-Series Data of cases as Pandas DataFrame
dir_jhu = './kaggle/input/jhucovid19/csse_covid_19_data/csse_covid_19_daily_reports'
df_list = []
for dirname, _, files in os.walk(dir_jhu):
for file in files:
if 'gitignore' not in file and 'README' not in file:
full_dir = os.path.join(dirname, file)
df_list.append(pd.read_csv(full_dir))
jhu_df = pd.concat(df_list, axis=0, ignore_index=True, sort=True)
# convert Last Update columns to datetime format
jhu_df.loc[:, 'Last Update'] = pd.to_datetime(jhu_df['Last Update']).apply(lambda x: x.date())
jhu_df.loc[:, 'Last_Update'] = pd.to_datetime(jhu_df['Last_Update']).apply(lambda x: x.date())
# Combine Last Update with Last_Update
jhu_df['LastUpdate'] = jhu_df['Last_Update'].combine_first(jhu_df['Last Update'])
# Combine Country/Region with Country_Region
jhu_df['CountryRegion'] = jhu_df['Country/Region'].combine_first(jhu_df['Country_Region'])
# Retrieve only US data
jhu_df = jhu_df[jhu_df['CountryRegion']=='US']
# Combine Province/State with Province_State
jhu_df['ProvinceState'] = jhu_df['Province/State'].combine_first(jhu_df['Province_State'])
# Drop unnecessary columns
jhu_df = jhu_df.drop(['Admin2', 'Lat', 'Latitude', 'Long_', 'Longitude', 'Combined_Key', 'Country/Region',
'Country_Region', 'Province/State', 'Province_State',
'Last Update', 'Last_Update', 'FIPS'], axis=1)
# Change column order
cols = list(jhu_df)
cols.insert(0, cols.pop(cols.index('CountryRegion')))
cols.insert(1, cols.pop(cols.index('ProvinceState')))
cols.insert(2, cols.pop(cols.index('LastUpdate')))
jhu_df = jhu_df.loc[:, cols]
# Change region to known US states
state_abbrs_dict = {}
for state in us.states.STATES:
state_abbrs_dict[state.abbr] = state.name
def toState(input_state, mapping):
abbreviation = input_state.rstrip()[-2:]
try:
return_value = mapping[abbreviation]
except KeyError:
return_value = input_state
return return_value
jhu_df['ProvinceState'] = jhu_df['ProvinceState'].apply(lambda x: toState(x, state_abbrs_dict) if x != 'Washington, D.C.' else 'District of Columbia')
# Filter out unknown states
jhu_df = jhu_df[jhu_df['ProvinceState'].isin(all_cases.state.unique().tolist())]
# Merge-sum rows with same date and State
jhu_df = jhu_df.groupby(['LastUpdate', 'ProvinceState']).agg(
{
'Active': sum,
'Confirmed': sum,
'Deaths': sum,
'Recovered': sum
}
).reset_index()
jhu_df.tail()
# Now that we have the JHU dataset relatively cleaned
# we can go ahead and merge its data with our main dataset
for i, row in all_cases.iterrows():
last_update = all_cases.at[i, 'date']
state = all_cases.at[i, 'state']
matching_row = jhu_df[jhu_df['ProvinceState'] == state]
matching_row = matching_row[matching_row['LastUpdate'] == last_update].reset_index()
if len(matching_row.values) > 0:
#all_cases.at[i, 'positive'] = matching_row['Confirmed'].values[0]
all_cases.at[i, 'active'] = matching_row['Active'].values[0]
#all_cases.at[i, 'recovered'] = matching_row['Recovered'].values[0] --- JHU was inconsistent, therefore removed
#all_cases.at[i, 'death'] = matching_row['Deaths'].values[0]
# Replace unknown recovery numbers with 0
if np.isnan(row['recovered']):
all_cases.at[i, 'recovered'] = 0
if all_cases.at[i, 'active'] == 0 or np.isnan(row['active']):
positive = all_cases.at[i, 'positive']
recovered = all_cases.at[i, 'recovered']
dead = all_cases.at[i, 'death']
all_cases.at[i, 'active'] = positive - recovered - dead
all_cases.tail()
# Save formatted dataset offline in case of disaster
dataset_file = 'results/all_cases.csv'
all_cases.to_csv(dataset_file)
# convert date to datetime format
all_cases['date'] = pd.to_datetime(all_cases['date'])
dataset_file = 'results/all_cases.csv'
covid_df = pd.read_csv(dataset_file, index_col=0)
# convert date to datetime format
covid_df['date'] = pd.to_datetime(covid_df['date'])
covid_df.info()
# set float format to 3 decimals
pd.set_option('display.float_format', lambda x: '%.3f' % x)
Conclusion: onVentilatorCurrently, inICUCurrently and Hospitalized and HospitalizationCumulative contain many NaN values. we further inspect the NaN values by printing the tail of the dataset.
covid_df.head()
The NaN values may indicate that there were no to few Covid-19 patients at these date points. We further analyse the statistical values of the dataset columns.
covid_df.describe()
# TODO rounding up the numbers
# drop unnecessary columns
covid_cleaned = covid_df.drop(['hospitalized', 'bedsPerThousand'], axis=1)
covid_100k = covid_cleaned.copy()
# list of columns to transform to per 100k
columns_list = ['positive', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']
# add columns per 100k
for column in columns_list:
if column == 'total_beds':
covid_100k['BedsPer100k'.format(column)] = (covid_cleaned.loc[:, column] / covid_cleaned.loc[:, 'population']) * 100000
else:
covid_100k['{}_100k'.format(column)] = (covid_cleaned.loc[:, column] / covid_cleaned.loc[:, 'population']) * 100000
covid_100k = covid_100k.drop(columns_list, axis=1)
covid_100k['date'] = pd.to_datetime(covid_100k['date'])
start_date = '2020-04-18'
end_date = '2020-05-19'
mask = (covid_100k['date'] > start_date) & (covid_100k['date'] <= end_date)
covid_100k_last_month = covid_100k.loc[mask]
covid_100k_last_month_part1 = covid_100k_last_month.groupby('date').sum().loc[:, ['positive_100k','active_100k','recovered_100k','death_100k','hospitalizedCumulative_100k']].diff(periods=1, axis=0)
covid_100k_last_month_part2 = covid_100k_last_month.groupby('date').sum().loc[:, ['inIcuCurrently_100k','onVentilatorCurrently_100k','BedsPer100k']]
final_100k_last_month = covid_100k_last_month_part1.merge(covid_100k_last_month_part2, left_index=True, right_index=True)
final_100k_last_month.head()
final_100k_last_month.describe()
# save description cleaned dataset to csv
describe_file = 'results/final_100k_last_month.csv'
final_100k_last_month.describe().to_csv(describe_file)
# Omitting the categorical (states/abbreviations) and time columns
# There must be an easier way for you, but this was the easiest way I could think of
covid_cleaned['date'] = pd.to_datetime(covid_cleaned['date'])
# mask data for last month
start_date = '2020-04-18'
end_date = '2020-05-19'
mask = (covid_cleaned['date'] > start_date) & (covid_cleaned['date'] <= end_date)
covid_cleaned_last_month = covid_cleaned.loc[mask]
plot_df = covid_cleaned_last_month[['population', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']]
plot_df_last_month = covid_100k_last_month[['population', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']]
timeseries_usa_df = covid_100k.loc[:, ['date', 'positive_100k', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']].groupby('date').sum().reset_index()
# timeseries_usa_df['log_positive'] = np.log(timeseries_usa_df['positive_100k'])
# timeseries_usa_df['log_active'] = np.log(timeseries_usa_df['active_100k'])
# timeseries_usa_df['log_recovered'] = np.log(timeseries_usa_df['recovered_100k'])
# timeseries_usa_df['log_death'] = np.log(timeseries_usa_df['death_100k'])
timeseries_usa_df.tail()
x_dates = timeseries_usa_df['date'].tolist()
y_hospitalized = timeseries_usa_df['hospitalizedCurrently_100k'].tolist()
y_icu = timeseries_usa_df['inIcuCurrently_100k'].tolist()
y_pos = timeseries_usa_df['positive_100k'].tolist()
y_act = timeseries_usa_df['active_100k'].tolist()
y_rec = timeseries_usa_df['recovered_100k'].tolist()
y_death = timeseries_usa_df['death_100k'].tolist()
y_vent = timeseries_usa_df['onVentilatorCurrently_100k'].tolist()
y_beds = timeseries_usa_df['BedsPer100k'].tolist()
hospitalized_trace = go.Scatter(x=x_dates, y=y_hospitalized, name='Hospitalized')
icu_trace = go.Scatter(x=x_dates, y=y_icu, name='ICU')
pos_trace = go.Scatter(x=x_dates, y=y_pos, name='Positive')
act_trace = go.Scatter(x=x_dates, y=y_act, name='Active')
rec_trace = go.Scatter(x=x_dates, y=y_rec, name='Recovered')
death_trace = go.Scatter(x=x_dates, y=y_death, name='Death')
vent_trace = go.Scatter(x=x_dates, y=y_vent, name='On Ventilator')
beds_trace = go.Scatter(x=x_dates, y=y_beds, name='Beds')
layout = go.Layout(xaxis_title='Date', yaxis_title='Count/100k', title_x=0.5, xaxis_tickformat = '%d-%m-%Y')
fig = make_subplots(rows=1, cols=2)
fig = go.Figure([hospitalized_trace, icu_trace, vent_trace], layout=layout)
fig.show()
fig = go.Figure([pos_trace, act_trace, rec_trace, death_trace, beds_trace], layout=layout)
fig.show()
covid_100k
# Timeseries plot
fig, ax = plt.subplots(figsize = (16, 16))
ax.plot(covid_df.date, covid_df.hospitalizedCurrently)
ax.set_title('Number of USA Patients Currently in ICU')
# TODO change the X-axis to chip the year
# TODO which timeseries plots do we need?
# TODO how to temporarily remove the outliers?
# get data from last day
# plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18']
# Plotting histograms to gain insight of the distribution shape, skewness and scale
fig, axs = plt.subplots(4,2,figsize = (16, 16))
sns.set()
for i, column in enumerate(plot_df_last_month.columns):
if (i + 1) % 2 == 0:
ax = axs[(i//2), 1]
else:
ax = axs[(i//2), 0]
sns.distplot(plot_df_last_month[column], fit=norm, fit_kws=dict(label='normality'), hist_kws=dict(color='plum', edgecolor='k', linewidth=1, label='frequency'), ax=ax, color='#9d53ad')
ax.legend(loc='upper right')
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Looking at linearity and variance with scatterplots
# Removing the target variable and saving it in another df
target = plot_df.hospitalizedCurrently
indep_var = plot_df.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 12))
for i, col in enumerate(indep_var.columns):
ax=fig.add_subplot(4, 3, i+1)
sns.regplot(x=indep_var[col], y=target, data=indep_var, label=col, scatter_kws={'s':10}, line_kws={"color": "plum", 'label': 'hospitCurr'})
plt.suptitle('Scatterplots with Target HospitalizedCurrently', fontsize=18)
plt.legend()
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Assessing the normality of the distribution with a boxplot
# Boxplot with removed outliers
fig, ax = plt.subplots(figsize = (16, 12))
for i, col in enumerate(plot_df.columns):
ax=fig.add_subplot(4, 3, i+1)
sns.boxplot(x=plot_df[col], data=plot_df, color='lightblue', showfliers=False)
plt.suptitle('Boxplots of Independent Variables', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# get data from last day
plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18']
fig, ax = plt.subplots(figsize = (16, 12))
for i, col in enumerate(plot_df_last_date.columns):
ax=fig.add_subplot(4, 3, i+1)
sns.boxplot(x=plot_df_last_date[col], data=plot_df, color='lightblue', showfliers=True)
plt.suptitle('Boxplots of Independent Variables', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Split covid_df into subset with only NY values
new_york = covid_df.loc[covid_df['abbrev'] == 'NY']
fig, ax = plt.subplots(figsize = (16, 12))
# Timeseries plt
plt.plot(new_york.date, new_york.hospitalizedCurrently)
plt.title('Number of Patients in NY Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# TODO the code below can be deleted once we fix the date format in plots
#set ticks every week
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
#set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m'))
# Omit the categorical and date cols
new_york = new_york[['positive', 'active', 'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
# Scatter plots NY
# Split dependent var from independent variables
target_ny = new_york.hospitalizedCurrently
indep_var_ny = new_york.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_ny.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.regplot(x=indep_var_ny[col], y=target_ny, data=indep_var_ny, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables NY', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of NY
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(new_york.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=new_york[col], data=new_york, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables NY', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
cali = covid_df.loc[(covid_df['abbrev'] == 'CA') & (covid_df['state']== 'California')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(cali.date, cali.hospitalizedCurrently)
plt.title('Number of Patients in CA Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
cali[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
cali.head()
# Omit the NaN cols
cali = cali[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots CA
# Split dependent var from independent variables
target_ca = cali.hospitalizedCurrently
indep_var_ca = cali.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_ca.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_ca[col], y=target_ca, data=indep_var_ca, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables CA', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of CA
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(cali.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=cali[col], data=cali, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables CA', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endcali
texas = covid_df.loc[(covid_df['abbrev'] == 'TX') & (covid_df['state']== 'Texas')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(texas.date, texas.hospitalizedCurrently)
plt.title('Number of Patients in TX Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
texas.head()
# Omit the NaN cols
texas = texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots TX
# Split dependent var from independent variables
target_tx = texas.hospitalizedCurrently
indep_var_tx = texas.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_tx.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_tx[col], y=target_tx, data=indep_var_tx, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of TX
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(texas.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=texas[col], data=texas, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endtx
sc = covid_df.loc[(covid_df['abbrev'] == 'SC') & (covid_df['state']== 'South Carolina')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(sc.date, sc.hospitalizedCurrently)
plt.title('Number of Patients in SC Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
sc[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
sc.head()
# Omit the NaN cols
sc = sc[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots SC
# Split dependent var from independent variables
target_sc = sc.hospitalizedCurrently
indep_var_sc = sc.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_sc.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_sc[col], y=target_sc, data=indep_var_sc, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables SC', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of SC
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(sc.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=sc[col], data=sc, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endsouthcarolina
nevada = covid_df.loc[(covid_df['abbrev'] == 'NV') & (covid_df['state']== 'Nevada')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(nevada.date, nevada.hospitalizedCurrently)
plt.title('Number of Patients in NV Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
nevada[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
nevada.head()
# Omit the NaN cols
nevada = nevada[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots NV
# Split dependent var from independent variables
target_nv = nevada.hospitalizedCurrently
indep_var_nv = nevada.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_nv.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_nv[col], y=target_nv, data=indep_var_nv, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables NV', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of NV
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(nevada.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=nevada[col], data=nevada, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables NV', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endnevada
arizona = covid_df.loc[(covid_df['abbrev'] == 'AZ') & (covid_df['state']== 'Arizona')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(arizona.date, arizona.hospitalizedCurrently)
plt.title('Number of Patients in AZ Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
arizona[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
arizona.head()
# Omit the NaN cols
arizona = arizona[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots AZ
# Split dependent var from independent variables
target_az = arizona.hospitalizedCurrently
indep_var_az = arizona.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_az.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_az[col], y=target_az, data=indep_var_az, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables AZ', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of AZ
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(arizona.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=arizona[col], data=arizona, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables AZ', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endarizona
mississippi = covid_df.loc[(covid_df['abbrev'] == 'MS') & (covid_df['state']== 'Mississippi')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(mississippi.date, mississippi.hospitalizedCurrently)
plt.title('Number of Patients in MS Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
mississippi[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
mississippi.head()
# Omit the NaN cols
mississippi = mississippi[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots MS
# Split dependent var from independent variables
target_ms = texas.hospitalizedCurrently
indep_var_ms = texas.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_ms.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_ms[col], y=target_ms, data=indep_var_ms, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables MS', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of MS
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(texas.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=mississippi[col], data=mississippi, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables MS', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endmississippi
texas = covid_df.loc[(covid_df['abbrev'] == 'TX') & (covid_df['state']== 'Texas')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(texas.date, texas.hospitalizedCurrently)
plt.title('Number of Patients in TX Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
texas.head()
# Omit the NaN cols
texas = texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots TX
# Split dependent var from independent variables
target_tx = texas.hospitalizedCurrently
indep_var_tx = texas.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_tx.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_tx[col], y=target_tx, data=indep_var_tx, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of TX
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(texas.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=texas[col], data=texas, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endutah
texas = covid_df.loc[(covid_df['abbrev'] == 'TX') & (covid_df['state']== 'Texas')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(texas.date, texas.hospitalizedCurrently)
plt.title('Number of Patients in TX Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
texas.head()
# Omit the NaN cols
texas = texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots TX
# Split dependent var from independent variables
target_tx = texas.hospitalizedCurrently
indep_var_tx = texas.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_tx.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_tx[col], y=target_tx, data=indep_var_tx, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of TX
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(texas.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=texas[col], data=texas, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endgeorgia
texas = covid_df.loc[(covid_df['abbrev'] == 'TX') & (covid_df['state']== 'Texas')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(texas.date, texas.hospitalizedCurrently)
plt.title('Number of Patients in TX Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
texas.head()
# Omit the NaN cols
texas = texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots TX
# Split dependent var from independent variables
target_tx = texas.hospitalizedCurrently
indep_var_tx = texas.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_tx.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_tx[col], y=target_tx, data=indep_var_tx, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of TX
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(texas.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=texas[col], data=texas, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endalabama
texas = covid_df.loc[(covid_df['abbrev'] == 'TX') & (covid_df['state']== 'Texas')]
# TODO fix legend/axis/plot alltogether
# Timeseries plt
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(texas.date, texas.hospitalizedCurrently)
plt.title('Number of Patients in TX Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
# Checking which cols have NaN values
texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
texas.head()
# Omit the NaN cols
texas = texas[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots TX
# Split dependent var from independent variables
target_tx = texas.hospitalizedCurrently
indep_var_tx = texas.drop(columns=['hospitalizedCurrently'])
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(indep_var_tx.columns):
ax=fig.add_subplot(2, 3, i+1)
sns.regplot(x=indep_var_tx[col], y=target_tx, data=indep_var_tx, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
plt.suptitle('Distributions of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of TX
fig, ax = plt.subplots(figsize = (16, 16))
for i, col in enumerate(texas.columns):
ax=fig.add_subplot(3, 3, i+1)
sns.boxplot(x=texas[col], data=texas, color='lightpink', showfliers=True)
plt.suptitle('Boxplots of Independent Variables TX', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
###endoklahoma
# TODO add some explanation / look more into collinear variables
# Heatmap of correlations
# Save correlations to variable
corr = covid_cleaned.corr(method='pearson')
# We can create a mask to not show duplicate values
mask = np.triu(np.ones_like(corr, dtype=np.bool))
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(16,16))
# Generate heatmap
sns.heatmap(corr, annot=True, mask=mask, cmap='GnBu', center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
# We compare three models:
# - Polynomial Regression
# - Linear Regression
# - ElasticNet
# Copy DFs to not mess up original one
# We will use model_df for our regression model
model_df = all_cases.copy()
# Delete redundant rows
for row in ['abbrev', 'bedsPerThousand', 'hospitalized',
'state', 'hospitalizedCumulative', 'dataQualityGrade', 'lastUpdateEt']:
del model_df[row]
# Drop NaN values for hospitalizedCurrently
model_df = model_df.dropna(subset=['hospitalizedCurrently'])
# Drop Values with abnormal active-hospitalised ratios (outside Conf. Interval)
model_df['ratio_hospital'] = model_df['hospitalizedCurrently'] / model_df['active']
model_df = model_df[~(model_df['ratio_hospital'] >= model_df.ratio_hospital.quantile(0.99))]
#model_df = model_df[~(model_df['ratio_hospital'] <= model_df['ratio_hospital'].median())]
del model_df['ratio_hospital']
# Get peek of model to use
model_df.describe()